# import counts
countsData <- read.delim(file = "../01_input/all.counts", sep = " ")
# preview counts
head(countsData)
##                  chr start stop strand length embryo_cells_rep1
## WBGene00014450 MtDNA     1   55      +     55                 0
## WBGene00014451 MtDNA    58  111      +     54                 0
## WBGene00010957 MtDNA   113  549      +    437                 0
## WBGene00010958 MtDNA   549  783      +    235                 0
## WBGene00014452 MtDNA   785  840      +     56                 0
## WBGene00014453 MtDNA   842  896      +     55                 0
##                embryo_cells_rep2 embryo_GFPminus_rep1 embryo_GFPminus_rep2
## WBGene00014450                 0                    0                    0
## WBGene00014451                 0                    0                    0
## WBGene00010957                 0                    0                    0
## WBGene00010958                 0                    0                    0
## WBGene00014452                 0                    0                    0
## WBGene00014453                 0                    0                    0
##                embryo_GFPminus_rep3 embryo_GFPplus_rep1 embryo_GFPplus_rep2
## WBGene00014450                    0                   0                   0
## WBGene00014451                    0                   0                   0
## WBGene00010957                    0                   0                   0
## WBGene00010958                    0                   0                   0
## WBGene00014452                    0                   0                   0
## WBGene00014453                    0                   0                   0
##                embryo_GFPplus_rep3 embryo_whole_rep2 embryo_whole_rep3
## WBGene00014450                   0                 0                 0
## WBGene00014451                   0                 0                 0
## WBGene00010957                   0                 0                 0
## WBGene00010958                   0                 0                 0
## WBGene00014452                   0                 0                 0
## WBGene00014453                   0                 0                 0
##                L1_cells_rep1 L1_cells_rep2 L1_cells_rep3 L1_GFPminus_rep1
## WBGene00014450             0             0             0                0
## WBGene00014451             0             0             0                0
## WBGene00010957             0             0             0                0
## WBGene00010958             0             0             0                0
## WBGene00014452             0             0             0                0
## WBGene00014453             0             0             0                0
##                L1_GFPminus_rep2 L1_GFPminus_rep3 L1_GFPplus_rep1
## WBGene00014450                0                0               0
## WBGene00014451                0                0               0
## WBGene00010957                0                0               0
## WBGene00010958                0                0               0
## WBGene00014452                0                0               0
## WBGene00014453                0                0               0
##                L1_GFPplus_rep2 L1_GFPplus_rep3 L1_whole_rep1 L1_whole_rep2
## WBGene00014450               0               0             0             0
## WBGene00014451               0               0             0             0
## WBGene00010957               0               0             0             0
## WBGene00010958               0               0             0             0
## WBGene00014452               0               0             0             0
## WBGene00014453               0               0             0             0
##                L1_whole_rep3 L3_cells_rep1 L3_cells_rep2 L3_cells_rep3
## WBGene00014450             0             0             0             0
## WBGene00014451             0             0             0             0
## WBGene00010957             0             0             0             0
## WBGene00010958             0             0             0             0
## WBGene00014452             0             0             0             0
## WBGene00014453             0             0             0             0
##                L3_GFPminus_rep1 L3_GFPplus_rep2 L3_GFPminus_rep3
## WBGene00014450                0               0                0
## WBGene00014451                0               0                0
## WBGene00010957                0               0                0
## WBGene00010958                0               0                0
## WBGene00014452                0               0                0
## WBGene00014453                0               0                0
##                L3_GFPplus_rep1 L3_GFPminus_rep2 L3_GFPplus_rep3 L3_whole_rep1
## WBGene00014450               0                0               0             0
## WBGene00014451               0                0               0             0
## WBGene00010957               0                0               0             0
## WBGene00010958               0                0               0             0
## WBGene00014452               0                0               0             0
## WBGene00014453               0                0               0             0
##                L3_whole_rep2 L3_whole_rep3
## WBGene00014450             0             0
## WBGene00014451             0             0
## WBGene00010957             0             0
## WBGene00010958             0             0
## WBGene00014452             0             0
## WBGene00014453             0             0
# print samples
colnames(countsData[6:ncol(countsData)])
##  [1] "embryo_cells_rep1"    "embryo_cells_rep2"    "embryo_GFPminus_rep1"
##  [4] "embryo_GFPminus_rep2" "embryo_GFPminus_rep3" "embryo_GFPplus_rep1" 
##  [7] "embryo_GFPplus_rep2"  "embryo_GFPplus_rep3"  "embryo_whole_rep2"   
## [10] "embryo_whole_rep3"    "L1_cells_rep1"        "L1_cells_rep2"       
## [13] "L1_cells_rep3"        "L1_GFPminus_rep1"     "L1_GFPminus_rep2"    
## [16] "L1_GFPminus_rep3"     "L1_GFPplus_rep1"      "L1_GFPplus_rep2"     
## [19] "L1_GFPplus_rep3"      "L1_whole_rep1"        "L1_whole_rep2"       
## [22] "L1_whole_rep3"        "L3_cells_rep1"        "L3_cells_rep2"       
## [25] "L3_cells_rep3"        "L3_GFPminus_rep1"     "L3_GFPplus_rep2"     
## [28] "L3_GFPminus_rep3"     "L3_GFPplus_rep1"      "L3_GFPminus_rep2"    
## [31] "L3_GFPplus_rep3"      "L3_whole_rep1"        "L3_whole_rep2"       
## [34] "L3_whole_rep3"
# import metadata and process file
metadata1 <- read.table(file = "../01_input/RWP27_metadata.tsv", header = FALSE, stringsAsFactors = FALSE) %>% bind_rows(read.table(file = "../01_input/RWP26_metadata.tsv", header = FALSE, stringsAsFactors = FALSE)) %>%
  bind_rows(read.table(file = "../01_input/RWP30_metadata.tsv", header = FALSE, stringsAsFactors = FALSE))

colnames(metadata1) <- c("Filename.Fwd", "Filename.Rev", "names")
head(metadata1)
##           Filename.Fwd         Filename.Rev                names
## 1 RW57_S10_L003_R1_001 RW57_S10_L003_R2_001    embryo_cells_rep1
## 2 RW58_S11_L003_R1_001 RW58_S11_L003_R2_001  embryo_GFPplus_rep1
## 3 RW59_S12_L003_R1_001 RW59_S12_L003_R2_001 embryo_GFPminus_rep1
## 4 RW60_S13_L003_R1_001 RW60_S13_L003_R2_001    embryo_whole_rep2
## 5 RW61_S14_L003_R1_001 RW61_S14_L003_R2_001    embryo_cells_rep2
## 6 RW62_S15_L003_R1_001 RW62_S15_L003_R2_001  embryo_GFPplus_rep2
# separate and process sample info
metadata1 <- metadata1 %>% separate(names, sep = "_", into = c("stage", "sample", "rep"), remove = FALSE)
metadata1 <- metadata1 %>% mutate(stage = fct_relevel(stage, c("embryo", "L1", "L3")), 
                     sample = fct_relevel(sample, c("whole", "cells", "GFPplus", "GFPminus")),
                     rep = fct_relevel(rep, c("rep1", "rep2", "rep3")),
                     names = fct_relevel(names, metadata1$names)
                     )

# Order columns according to metadata1 order
countsData <- countsData  %>% select(chr:length, sort(metadata1$names))
head(countsData)
##                  chr start stop strand length embryo_cells_rep1
## WBGene00014450 MtDNA     1   55      +     55                 0
## WBGene00014451 MtDNA    58  111      +     54                 0
## WBGene00010957 MtDNA   113  549      +    437                 0
## WBGene00010958 MtDNA   549  783      +    235                 0
## WBGene00014452 MtDNA   785  840      +     56                 0
## WBGene00014453 MtDNA   842  896      +     55                 0
##                embryo_GFPplus_rep1 embryo_GFPminus_rep1 embryo_whole_rep2
## WBGene00014450                   0                    0                 0
## WBGene00014451                   0                    0                 0
## WBGene00010957                   0                    0                 0
## WBGene00010958                   0                    0                 0
## WBGene00014452                   0                    0                 0
## WBGene00014453                   0                    0                 0
##                embryo_cells_rep2 embryo_GFPplus_rep2 embryo_GFPminus_rep2
## WBGene00014450                 0                   0                    0
## WBGene00014451                 0                   0                    0
## WBGene00010957                 0                   0                    0
## WBGene00010958                 0                   0                    0
## WBGene00014452                 0                   0                    0
## WBGene00014453                 0                   0                    0
##                embryo_whole_rep3 embryo_GFPplus_rep3 embryo_GFPminus_rep3
## WBGene00014450                 0                   0                    0
## WBGene00014451                 0                   0                    0
## WBGene00010957                 0                   0                    0
## WBGene00010958                 0                   0                    0
## WBGene00014452                 0                   0                    0
## WBGene00014453                 0                   0                    0
##                L1_whole_rep1 L1_cells_rep1 L1_GFPplus_rep1 L1_GFPminus_rep1
## WBGene00014450             0             0               0                0
## WBGene00014451             0             0               0                0
## WBGene00010957             0             0               0                0
## WBGene00010958             0             0               0                0
## WBGene00014452             0             0               0                0
## WBGene00014453             0             0               0                0
##                L1_whole_rep2 L1_cells_rep2 L1_GFPplus_rep2 L1_GFPminus_rep2
## WBGene00014450             0             0               0                0
## WBGene00014451             0             0               0                0
## WBGene00010957             0             0               0                0
## WBGene00010958             0             0               0                0
## WBGene00014452             0             0               0                0
## WBGene00014453             0             0               0                0
##                L1_whole_rep3 L1_cells_rep3 L1_GFPplus_rep3 L1_GFPminus_rep3
## WBGene00014450             0             0               0                0
## WBGene00014451             0             0               0                0
## WBGene00010957             0             0               0                0
## WBGene00010958             0             0               0                0
## WBGene00014452             0             0               0                0
## WBGene00014453             0             0               0                0
##                L3_whole_rep1 L3_cells_rep1 L3_GFPplus_rep1 L3_GFPminus_rep1
## WBGene00014450             0             0               0                0
## WBGene00014451             0             0               0                0
## WBGene00010957             0             0               0                0
## WBGene00010958             0             0               0                0
## WBGene00014452             0             0               0                0
## WBGene00014453             0             0               0                0
##                L3_whole_rep2 L3_cells_rep2 L3_GFPminus_rep2 L3_GFPplus_rep2
## WBGene00014450             0             0                0               0
## WBGene00014451             0             0                0               0
## WBGene00010957             0             0                0               0
## WBGene00010958             0             0                0               0
## WBGene00014452             0             0                0               0
## WBGene00014453             0             0                0               0
##                L3_whole_rep3 L3_cells_rep3 L3_GFPplus_rep3 L3_GFPminus_rep3
## WBGene00014450             0             0               0                0
## WBGene00014451             0             0               0                0
## WBGene00010957             0             0               0                0
## WBGene00010958             0             0               0                0
## WBGene00014452             0             0               0                0
## WBGene00014453             0             0               0                0
# Generate a table called "cts" out of the countsData table. Subset the countsData.
cts <- as.matrix(countsData %>% select(metadata1$names))
head(cts)
##                embryo_cells_rep1 embryo_GFPplus_rep1 embryo_GFPminus_rep1
## WBGene00014450                 0                   0                    0
## WBGene00014451                 0                   0                    0
## WBGene00010957                 0                   0                    0
## WBGene00010958                 0                   0                    0
## WBGene00014452                 0                   0                    0
## WBGene00014453                 0                   0                    0
##                embryo_whole_rep2 embryo_cells_rep2 embryo_GFPplus_rep2
## WBGene00014450                 0                 0                   0
## WBGene00014451                 0                 0                   0
## WBGene00010957                 0                 0                   0
## WBGene00010958                 0                 0                   0
## WBGene00014452                 0                 0                   0
## WBGene00014453                 0                 0                   0
##                embryo_GFPminus_rep2 embryo_whole_rep3 embryo_GFPplus_rep3
## WBGene00014450                    0                 0                   0
## WBGene00014451                    0                 0                   0
## WBGene00010957                    0                 0                   0
## WBGene00010958                    0                 0                   0
## WBGene00014452                    0                 0                   0
## WBGene00014453                    0                 0                   0
##                embryo_GFPminus_rep3 L1_whole_rep1 L1_cells_rep1 L1_GFPplus_rep1
## WBGene00014450                    0             0             0               0
## WBGene00014451                    0             0             0               0
## WBGene00010957                    0             0             0               0
## WBGene00010958                    0             0             0               0
## WBGene00014452                    0             0             0               0
## WBGene00014453                    0             0             0               0
##                L1_GFPminus_rep1 L1_whole_rep2 L1_cells_rep2 L1_GFPplus_rep2
## WBGene00014450                0             0             0               0
## WBGene00014451                0             0             0               0
## WBGene00010957                0             0             0               0
## WBGene00010958                0             0             0               0
## WBGene00014452                0             0             0               0
## WBGene00014453                0             0             0               0
##                L1_GFPminus_rep2 L1_whole_rep3 L1_cells_rep3 L1_GFPplus_rep3
## WBGene00014450                0             0             0               0
## WBGene00014451                0             0             0               0
## WBGene00010957                0             0             0               0
## WBGene00010958                0             0             0               0
## WBGene00014452                0             0             0               0
## WBGene00014453                0             0             0               0
##                L1_GFPminus_rep3 L3_whole_rep1 L3_cells_rep1 L3_GFPplus_rep1
## WBGene00014450                0             0             0               0
## WBGene00014451                0             0             0               0
## WBGene00010957                0             0             0               0
## WBGene00010958                0             0             0               0
## WBGene00014452                0             0             0               0
## WBGene00014453                0             0             0               0
##                L3_GFPminus_rep1 L3_whole_rep2 L3_cells_rep2 L3_GFPminus_rep2
## WBGene00014450                0             0             0                0
## WBGene00014451                0             0             0                0
## WBGene00010957                0             0             0                0
## WBGene00010958                0             0             0                0
## WBGene00014452                0             0             0                0
## WBGene00014453                0             0             0                0
##                L3_GFPplus_rep2 L3_whole_rep3 L3_cells_rep3 L3_GFPplus_rep3
## WBGene00014450               0             0             0               0
## WBGene00014451               0             0             0               0
## WBGene00010957               0             0             0               0
## WBGene00010958               0             0             0               0
## WBGene00014452               0             0             0               0
## WBGene00014453               0             0             0               0
##                L3_GFPminus_rep3
## WBGene00014450                0
## WBGene00014451                0
## WBGene00010957                0
## WBGene00010958                0
## WBGene00014452                0
## WBGene00014453                0
# Reorganize the metadata table so the names2 column are now headers
rownames(metadata1)<- metadata1$names
coldata <- metadata1[,c("names", "stage", "sample", "rep")]
rownames(coldata) <- as.vector(metadata1$names)
# make grouping variable
coldata$group <- factor(paste0(coldata$stage, coldata$sample))
coldata
##                                     names  stage   sample  rep          group
## embryo_cells_rep1       embryo_cells_rep1 embryo    cells rep1    embryocells
## embryo_GFPplus_rep1   embryo_GFPplus_rep1 embryo  GFPplus rep1  embryoGFPplus
## embryo_GFPminus_rep1 embryo_GFPminus_rep1 embryo GFPminus rep1 embryoGFPminus
## embryo_whole_rep2       embryo_whole_rep2 embryo    whole rep2    embryowhole
## embryo_cells_rep2       embryo_cells_rep2 embryo    cells rep2    embryocells
## embryo_GFPplus_rep2   embryo_GFPplus_rep2 embryo  GFPplus rep2  embryoGFPplus
## embryo_GFPminus_rep2 embryo_GFPminus_rep2 embryo GFPminus rep2 embryoGFPminus
## embryo_whole_rep3       embryo_whole_rep3 embryo    whole rep3    embryowhole
## embryo_GFPplus_rep3   embryo_GFPplus_rep3 embryo  GFPplus rep3  embryoGFPplus
## embryo_GFPminus_rep3 embryo_GFPminus_rep3 embryo GFPminus rep3 embryoGFPminus
## L1_whole_rep1               L1_whole_rep1     L1    whole rep1        L1whole
## L1_cells_rep1               L1_cells_rep1     L1    cells rep1        L1cells
## L1_GFPplus_rep1           L1_GFPplus_rep1     L1  GFPplus rep1      L1GFPplus
## L1_GFPminus_rep1         L1_GFPminus_rep1     L1 GFPminus rep1     L1GFPminus
## L1_whole_rep2               L1_whole_rep2     L1    whole rep2        L1whole
## L1_cells_rep2               L1_cells_rep2     L1    cells rep2        L1cells
## L1_GFPplus_rep2           L1_GFPplus_rep2     L1  GFPplus rep2      L1GFPplus
## L1_GFPminus_rep2         L1_GFPminus_rep2     L1 GFPminus rep2     L1GFPminus
## L1_whole_rep3               L1_whole_rep3     L1    whole rep3        L1whole
## L1_cells_rep3               L1_cells_rep3     L1    cells rep3        L1cells
## L1_GFPplus_rep3           L1_GFPplus_rep3     L1  GFPplus rep3      L1GFPplus
## L1_GFPminus_rep3         L1_GFPminus_rep3     L1 GFPminus rep3     L1GFPminus
## L3_whole_rep1               L3_whole_rep1     L3    whole rep1        L3whole
## L3_cells_rep1               L3_cells_rep1     L3    cells rep1        L3cells
## L3_GFPplus_rep1           L3_GFPplus_rep1     L3  GFPplus rep1      L3GFPplus
## L3_GFPminus_rep1         L3_GFPminus_rep1     L3 GFPminus rep1     L3GFPminus
## L3_whole_rep2               L3_whole_rep2     L3    whole rep2        L3whole
## L3_cells_rep2               L3_cells_rep2     L3    cells rep2        L3cells
## L3_GFPminus_rep2         L3_GFPminus_rep2     L3 GFPminus rep2     L3GFPminus
## L3_GFPplus_rep2           L3_GFPplus_rep2     L3  GFPplus rep2      L3GFPplus
## L3_whole_rep3               L3_whole_rep3     L3    whole rep3        L3whole
## L3_cells_rep3               L3_cells_rep3     L3    cells rep3        L3cells
## L3_GFPplus_rep3           L3_GFPplus_rep3     L3  GFPplus rep3      L3GFPplus
## L3_GFPminus_rep3         L3_GFPminus_rep3     L3 GFPminus rep3     L3GFPminus
# Check that the names match  --> Should be TRUE
all(rownames(coldata) == colnames(cts))
## [1] TRUE

Make DESeqDataSet

Generate the DESeqDataSet. The variables in this design formula will be the type of sample, and the preparation date. This should reduce the variability between the samples based on when they were made.

From the vignette: “In order to benefit from the default settings of the package, you should put the variable of interest at the end of the formula and make sure the control level is the first level.”

The variable of interest is the sample type.

Using DESeqDataSetFromMatrix since I used the program featureCounts.

dds <- DESeqDataSetFromMatrix(countData = cts,
                              colData = coldata,
                              design = ~ group)

Visualize read count distribution

hist(log(rowSums(counts(dds))))
abline(v = log(10), col = "red", lty = 2)

Filter genes with low read counts

keep <- rowSums(counts(dds)) >=10
dds <- dds[keep,]
dds
## class: DESeqDataSet 
## dim: 26557 34 
## metadata(1): version
## assays(1): counts
## rownames(26557): WBGene00021406 WBGene00021407 ... WBGene00199694
##   WBGene00044951
## rowData names(0):
## colnames(34): embryo_cells_rep1 embryo_GFPplus_rep1 ... L3_GFPplus_rep3
##   L3_GFPminus_rep3
## colData names(5): names stage sample rep group

Perform Differential Expression

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds)
##  [1] "Intercept"                           "group_embryoGFPminus_vs_embryocells"
##  [3] "group_embryoGFPplus_vs_embryocells"  "group_embryowhole_vs_embryocells"   
##  [5] "group_L1cells_vs_embryocells"        "group_L1GFPminus_vs_embryocells"    
##  [7] "group_L1GFPplus_vs_embryocells"      "group_L1whole_vs_embryocells"       
##  [9] "group_L3cells_vs_embryocells"        "group_L3GFPminus_vs_embryocells"    
## [11] "group_L3GFPplus_vs_embryocells"      "group_L3whole_vs_embryocells"

Sample-to-sample distance matrix

vsd <- vst(dds, blind = FALSE)
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- vsd$names
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         col = colors)

vsd.corr.per.stage <- function(x, main){
vsd <- assay(vsd)[,metadata1 %>% filter(grepl(x, names)) %>% pull(names)]
sampleDists <- dist(t(vsd))
sampleDistMatrix <- as.matrix(sampleDists)
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         col = colors, 
         main = main)
}
vsd.corr.per.stage("embryo", "Embryo Stage Intestine FACS RNA-seq Correlation")

vsd.corr.per.stage("L1", "L1 Stage Intestine FACS RNA-seq Correlation")

vsd.corr.per.stage("L3", "L3 Stage Intestine FACS RNA-seq Correlation")

All embryo stage pairwise comparisons

Code testing, preliminary visualization

#for loop attempt one
combinations<- data.frame(combos = 1:4, top = c("embryowhole", "embryoGFPplus", "embryoGFPminus", "embryoGFPplus"), bottom = c("embryocells", "embryocells", "embryocells", "embryoGFPminus"))

res_embryowhole_vs_embryocells <- results(dds, contrast = c("group", combinations$top[1], combinations$bottom[1]))
res_embryowhole_vs_embryocells_shrunk <- lfcShrink(dds, contrast = c("group", combinations$top[1], combinations$bottom[1]), type = "ashr", quiet = TRUE)

res_embryowhole_vs_embryocells <- results(dds, contrast = c("group", combinations$top[1], combinations$bottom[1]))
res_embryowhole_vs_embryocells_shrunk <- lfcShrink(dds, contrast = c("group", combinations$top[1], combinations$bottom[1]), type = "ashr", quiet = TRUE)

for(i in 1:nrow(combinations)){
  res.var.name<- paste("res",combinations$top[i], "vs", combinations$bottom[i], sep = "_")
  res.shrunk.var.name <- paste("res",combinations$top[i], "vs", combinations$bottom[i], "shrunk", sep = "_")
  print(res.var.name)
  print(res.shrunk.var.name)
  assign(res.var.name, results(dds, contrast = c("group", combinations$top[i], combinations$bottom[i])))
}
## [1] "res_embryowhole_vs_embryocells"
## [1] "res_embryowhole_vs_embryocells_shrunk"
## [1] "res_embryoGFPplus_vs_embryocells"
## [1] "res_embryoGFPplus_vs_embryocells_shrunk"
## [1] "res_embryoGFPminus_vs_embryocells"
## [1] "res_embryoGFPminus_vs_embryocells_shrunk"
## [1] "res_embryoGFPplus_vs_embryoGFPminus"
## [1] "res_embryoGFPplus_vs_embryoGFPminus_shrunk"
# res_embryoGFPplus_vs_embryoGFPminus
# plotMA(res_embryowhole_vs_embryocells_shrunk, ylim = ylim, main = "whole vs cells")
# combinations$top[1]
# class(res_embryowhole_vs_embryocells)

# for loop attempt two
samples <- paste("embryo", c("whole", "GFPplus", "cells", "GFPminus"), sep = "")
combos <- combn(samples, 2, simplify = FALSE)

for(i in 1:length(combos)){
  print(c(combos[[i]][1], combos[[i]][2]))
  res.var.name<- paste("res",combos[[i]][1], "vs", combos[[i]][2], sep = "_")
  res.shrunk.var.name <- paste("res",combos[[i]][1], "vs", combos[[i]][2], "shrunk", sep = "_")
  print(res.var.name)
  print(res.shrunk.var.name)
  assign(res.var.name, results(dds, contrast = c("group", combos[[i]][1], combos[[i]][2])))
  assign(res.shrunk.var.name, lfcShrink(dds, contrast = c("group", combos[[i]][1], combos[[i]][2]), type = "ashr", quiet = TRUE))
}
## [1] "embryowhole"   "embryoGFPplus"
## [1] "res_embryowhole_vs_embryoGFPplus"
## [1] "res_embryowhole_vs_embryoGFPplus_shrunk"
## [1] "embryowhole" "embryocells"
## [1] "res_embryowhole_vs_embryocells"
## [1] "res_embryowhole_vs_embryocells_shrunk"
## [1] "embryowhole"    "embryoGFPminus"
## [1] "res_embryowhole_vs_embryoGFPminus"
## [1] "res_embryowhole_vs_embryoGFPminus_shrunk"
## [1] "embryoGFPplus" "embryocells"  
## [1] "res_embryoGFPplus_vs_embryocells"
## [1] "res_embryoGFPplus_vs_embryocells_shrunk"
## [1] "embryoGFPplus"  "embryoGFPminus"
## [1] "res_embryoGFPplus_vs_embryoGFPminus"
## [1] "res_embryoGFPplus_vs_embryoGFPminus_shrunk"
## [1] "embryocells"    "embryoGFPminus"
## [1] "res_embryocells_vs_embryoGFPminus"
## [1] "res_embryocells_vs_embryoGFPminus_shrunk"
combos
## [[1]]
## [1] "embryowhole"   "embryoGFPplus"
## 
## [[2]]
## [1] "embryowhole" "embryocells"
## 
## [[3]]
## [1] "embryowhole"    "embryoGFPminus"
## 
## [[4]]
## [1] "embryoGFPplus" "embryocells"  
## 
## [[5]]
## [1] "embryoGFPplus"  "embryoGFPminus"
## 
## [[6]]
## [1] "embryocells"    "embryoGFPminus"
par(mfrow = c(3, 3), mar=c(2,2,1,1))
ylim = c(-10,10)
plotMA(res_embryowhole_vs_embryocells_shrunk, ylim = ylim, main = "whole vs cells")
plotMA(res_embryowhole_vs_embryoGFPplus_shrunk, ylim = ylim, main = "whole vs GFP+")
plotMA(res_embryowhole_vs_embryoGFPminus_shrunk, ylim = ylim, main = "whole vs GFP-")
plotMA(res_embryoGFPplus_vs_embryocells_shrunk, ylim = ylim, main = "GFP+ vs cells")
plotMA(res_embryoGFPplus_vs_embryoGFPminus_shrunk, ylim = ylim, main = "GFP+ vs GFP-")
plotMA(res_embryocells_vs_embryoGFPminus_shrunk, ylim = ylim, main = "cells vs GFP-")

ggplot version of plotting

all_embryo_comparisons <- data.frame()
for(i in 1:length(combos)){
tobind <- as.data.frame(lfcShrink(dds, contrast = c("group", combos[[i]][1], combos[[i]][2]), type = "ashr", quiet = TRUE)) %>% 
  rownames_to_column(var = "WBGeneID") %>% 
  mutate(comparison = paste(combos[[i]][1], combos[[i]][2], sep = "_vs_")) 
all_embryo_comparisons <- bind_rows(all_embryo_comparisons, tobind)
}


ggplot(all_embryo_comparisons %>% filter(!is.na(padj)), aes(x = log10(baseMean), y = log2FoldChange, color = padj < 0.1)) +
  geom_point(shape = 16, alpha = 0.1, stroke = 0, size = 1) +
  ylim(c(-10,10))+
  facet_wrap(~comparison) +
  scale_color_manual(values = c("black", "red"), name = "q.value < 0.1") +
  theme_classic()
## Warning: Removed 19 rows containing missing values (geom_point).

# Functions

pairwise_array_df <- function(stage) {
  samples <- paste(stage, c("whole", "GFPplus", "cells", "GFPminus"), sep = "")
  combos <- combn(samples, 2, simplify = FALSE)
  # print(combos)
  all_pairwise_comparisons <- data.frame()
  for (i in 1:length(combos)) {
    tobind <-
      as.data.frame(lfcShrink(
        dds,
        contrast = c("group", combos[[i]][1], combos[[i]][2]),
        type = "ashr",
        quiet = TRUE
      )) %>%
      rownames_to_column(var = "WBGeneID") %>%
      mutate(comparison = paste(combos[[i]][1], combos[[i]][2], sep = "_vs_"))  %>% 
      mutate(label = str_remove_all(comparison, "embryo|L1|L3"))
    all_pairwise_comparisons <- bind_rows(all_pairwise_comparisons, tobind)
  }
  all_pairwise_comparisons
}

MA_plot_array <- function(in.df, title){
  ggplot(in.df %>% mutate(padj = replace_na(padj, 1)), aes(x = log10(baseMean), y = log2FoldChange, color = padj < 0.1)) +
  geom_point(shape = 16, alpha = 0.1, stroke = 0, size = 1) +
  ylim(c(-10,10))+
  facet_wrap(~label) +
  scale_color_manual(values = c("black", "red"), name = "q.value < 0.1") +
  theme_classic() +
  ggtitle(title)
}

alt_hyp_res_df <- function(stage){
samples <- paste(stage, c("whole", "GFPplus", "cells", "GFPminus"), sep = "")
combos <- combn(samples, 2, simplify = FALSE)
thresh = 0.5
sig = 0.01
hyps = c("greater", "less", "lessAbs")
df <- data.frame()
for(i in 1:length(combos)){
  for(hyp in hyps){
    thresh_res <- results(dds, contrast = c("group", combos[[i]][1],combos[[i]][2]), lfcThreshold=thresh, altHypothesis = hyp, alpha = sig)
    tobind<-data.frame(plotMA(thresh_res, returnData = TRUE), 
                      comparison = paste(combos[[i]][1], combos[[i]][2], sep = "_vs_"), 
                       type = hyp) %>% mutate(label = str_remove_all(comparison, "embryo|L1|L3"))
    df <- bind_rows(df, tobind)
  }
}
df
}

de_category_MA_plot <- function(df, title){
df %>% filter(isDE == TRUE) %>%
  ggplot(aes(x = log10(mean), y = lfc, color = type)) +
  geom_point(data =df, shape = 16, alpha = 0.01, stroke = 0, size = 1, color = "grey") +
  geom_point(shape = 16, alpha = 0.5, stroke = 0, size = 1) +
  ylim(c(-10,10))+
  facet_wrap(~label) +
  # scale_color_manual(values = c("black", "red"), name = "q.value < 0.1") +
  theme_classic() +
    ggtitle(title)
}

options(dplyr.summarise.inform = FALSE)
de_category_bar_plot <- function(df, title){
  df %>% filter(isDE == TRUE) %>% group_by(label, type) %>% summarize(genes = n()) %>%
  ggplot(aes(x = type, y = genes, label = genes, fill = type)) +
  geom_bar(stat = "identity") +
  geom_text(vjust = -0.25) +
  facet_wrap(~label) +
    theme_classic() +
    ggtitle(title)
}
embryo_alt_hyp_res_df <- alt_hyp_res_df("embryo")
thresh = 0.5
sig = 0.01
de_category_MA_plot(embryo_alt_hyp_res_df, paste("Embryo differentially expressed genes\nlfc = ",thresh," & padj < ",sig, sep = ""))
## Warning: Removed 195 rows containing missing values (geom_point).
## Warning: Removed 44 rows containing missing values (geom_point).

de_category_bar_plot(embryo_alt_hyp_res_df, paste("Embryo differentially expressed genes\nlfc = ",thresh," & padj < ",sig, sep = ""))

L1_alt_hyp_res_df<- alt_hyp_res_df("L1")
de_category_MA_plot(L1_alt_hyp_res_df, paste("L1 differentially expressed genes\nlfc = ",thresh," & padj < ",sig, sep = ""))
## Warning: Removed 12 rows containing missing values (geom_point).

de_category_bar_plot(L1_alt_hyp_res_df, paste("L1 differentially expressed genes\nlfc = ",thresh," & padj < ",sig, sep = ""))

L3_alt_hyp_res_df<- alt_hyp_res_df("L3")
de_category_MA_plot(L3_alt_hyp_res_df, paste("L3 differentially expressed genes\nlfc = ",thresh," & padj < ",sig, sep = ""))
## Warning: Removed 135 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).

de_category_bar_plot(L3_alt_hyp_res_df, paste("L3 differentially expressed genes\nlfc = ",thresh," & padj < ",sig, sep = ""))

Embryo Intestine FACS MA plots

embryo_pairwise_res_shrunk <- pairwise_array_df(stage = "embryo")

MA_plot_array(embryo_pairwise_res_shrunk, "embryo FACS ashr shrunk")
## Warning: Removed 21 rows containing missing values (geom_point).

L1 Intestine FACS MA plots

L1_pairwise_res_shrunk <- pairwise_array_df(stage = "L1")

MA_plot_array(L1_pairwise_res_shrunk, "L1 FACS ashr shrunk")
## Warning: Removed 3 rows containing missing values (geom_point).

L3 Intestine FACS MA plots

L3_pairwise_res_shrunk <- pairwise_array_df(stage = "L3")
MA_plot_array(L3_pairwise_res_shrunk, "L3 FACS ashr shrunk")
## Warning: Removed 33 rows containing missing values (geom_point).

DESeq2 builtin plotting function

res_embryoGFPplus_vs_embryoGFPminus <- results(dds, contrast = c("group", "embryoGFPplus", "embryoGFPminus"))
res_L1GFPplus_vs_L1_GFPminus <- results(dds, contrast = c("group", "L1GFPplus", "L1GFPminus"))
res_L3GFPplus_vs_L3_GFPminus <- results(dds, contrast = c("group", "L3GFPplus", "L3GFPminus"))
par(mfrow=c(1,3),mar=c(2,2,1,1))
ylim <- c(-15,15)
# drawLines <- function() abline(h=c(-2,2),col="dodgerblue",lwd=2)
sig = 0.01
plotMA(res_embryoGFPplus_vs_embryoGFPminus, ylim=ylim, main = "Embryo GFP+ vs GFP-", alpha = sig)
plotMA(res_L1GFPplus_vs_L1_GFPminus, ylim=ylim, main = "L1 GFP+ vs GFP-", alpha = sig)
plotMA(res_L3GFPplus_vs_L3_GFPminus, ylim=ylim, main = "L3 GFP+ vs GFP-", alpha = sig)

res_embryoGFPplus_vs_embryoGFPminus_apeglm <- lfcShrink(dds, contrast = c("group", "embryoGFPplus", "embryoGFPminus"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
res_L1GFPplus_vs_L1GFPminus_apeglm <- lfcShrink(dds, contrast = c("group", "L1GFPplus", "L1GFPminus"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
res_L3GFPplus_vs_L3GFPminus_apeglm <- lfcShrink(dds, contrast = c("group", "L3GFPplus", "L3GFPminus"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
par(mfrow=c(1,3),mar=c(2,2,1,1))
ylim <- c(-10,10)
# drawLines <- function() abline(h=c(-2,2),col="dodgerblue",lwd=2)
sig = 0.01
plotMA(res_embryoGFPplus_vs_embryoGFPminus_apeglm, ylim=ylim, main = "Embryo GFP+ vs GFP-", alpha = sig)
plotMA(res_L1GFPplus_vs_L1GFPminus_apeglm, ylim=ylim, main = "L1 GFP+ vs GFP-", alpha = sig)
plotMA(res_L3GFPplus_vs_L3GFPminus_apeglm, ylim=ylim, main = "L3 GFP+ vs GFP-", alpha = sig)

plotCounts(dds, "WBGene00001578", intgroup = "group", main = "ges-1 read counts")

plotCounts(dds, "WBGene00001578", intgroup = "group", returnData = TRUE) %>% 
  separate(group, sep = "(?<=embryo)|(?<=L1)|(?<=L3)", into = c("stage", "sample"), remove = FALSE) %>%
  ggplot(aes(x = sample, y = count)) +
  geom_boxplot() +
  geom_point() +
  facet_grid(~stage) +
  ggtitle("ges-1 read counts") +
  theme_classic()

plotCounts(dds, "WBGene00001250", intgroup = "group", main = "elt-2 read counts")

plotCounts(dds, "WBGene00001250", intgroup = "group", returnData = TRUE) %>% 
  separate(group, sep = "(?<=embryo)|(?<=L1)|(?<=L3)", into = c("stage", "sample"), remove = FALSE) %>%
  ggplot(aes(x = sample, y = count)) +
  geom_boxplot() +
  geom_point() +
  facet_grid(~stage) +
  ggtitle("elt-2 read counts") +
  theme_classic()

# Annotate and quantify tissue specific genes

tissue_specific_genes <- read_csv(file = "../../01_tissue_specific_genes/03_output/tissue_specific_genes_220202.csv", show_col_types = FALSE)


tissue_annotated_MA <- function(in_res){
df <- as.data.frame(in_res) %>% rownames_to_column(var = "WBGeneID") %>%
  left_join(tissue_specific_genes, by = "WBGeneID") %>%
  mutate(tissue = replace_na(tissue, "other"))

df %>%
  ggplot(aes(x = log10(baseMean), y = log2FoldChange, color = tissue)) +
  geom_point(data =df %>% select(-tissue), shape = 16, alpha = 0.1, stroke = 0, size = 1, color = "grey") +
  geom_point(shape = 16, alpha = 0.5, stroke = 0, size = 1) +
  facet_wrap(~tissue) +
  ylim(c(-10,10)) +
  theme_classic()
}
tissue_annotated_MA(res_embryoGFPplus_vs_embryoGFPminus_shrunk)
## Warning: Removed 108 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_point).

tissue_annotated_MA(res_L1GFPplus_vs_L1GFPminus_apeglm)
## Warning: Removed 9 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).

tissue_annotated_MA(res_L3GFPplus_vs_L3GFPminus_apeglm)
## Warning: Removed 54 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).

L3_apeglm_df <- as.data.frame(res_L3GFPplus_vs_L3GFPminus_apeglm) %>% rownames_to_column(var = "WBGeneID") %>%
  left_join(tissue_specific_genes, by = "WBGeneID") %>%
  mutate(tissue = replace_na(tissue, "other"), padj = replace_na(padj, 1))

L3_apeglm_df %>%
  ggplot(aes(x = log10(baseMean), y = log2FoldChange, color = (log2FoldChange>0.5 & padj < 0.01)|(log2FoldChange<0.5 & padj < 0.01))) +
  # geom_point(data =df %>% select(-tissue), shape = 16, alpha = 0.1, stroke = 0, size = 1, color = "grey") +
  geom_point(shape = 16, alpha = 0.5, stroke = 0, size = 1) +
  # facet_wrap(~tissue) +
  ylim(c(-10,10)) +
  theme_classic()
## Warning: Removed 6 rows containing missing values (geom_point).

tissue_gene_quant <- function(res, sig = 0.01, thresh = 0.5){
  my_plot <- as.data.frame(res) %>% rownames_to_column(var = "WBGeneID") %>%
  left_join(tissue_specific_genes, by = "WBGeneID") %>%
  mutate(tissue = replace_na(tissue, "other"), padj = replace_na(padj, 1)) %>% mutate(status = case_when(
  log2FoldChange > thresh & padj < sig ~ "greater",
  log2FoldChange < thresh & padj < sig ~ "less",
  TRUE ~ "no_sig_diff"
)) %>%
  group_by(tissue, status) %>%
  summarise(genes = n()) %>%
  ggplot(aes(x = status, y = genes, label = genes, fill = status)) +
  geom_bar(stat = "identity") +
  geom_text(vjust = -0.25) +
  facet_wrap(~tissue)+
  ggtitle(paste("comparison: ",deparse(substitute(res)), "\nlfc = ",thresh," & padj < ",sig, sep = "")) +
    theme_classic()
  my_plot
}
tissue_gene_quant(res_embryoGFPplus_vs_embryoGFPminus)

tissue_gene_quant(res_L1GFPplus_vs_L1_GFPminus)

tissue_gene_quant(res_L3GFPplus_vs_L3_GFPminus)

Export rlog counts

# all_samples_rld <- rlog(dds)
# write_rds(all_samples_rld, file = "../03_output/all_samples_rlog_counts.rds")
all_samples_rld <- read_rds(file = "../03_output/all_samples_rlog_counts.rds")
all_samples_rld_df <- as.data.frame(assay(all_samples_rld)) %>% rownames_to_column(var = "WBGeneID")
head(all_samples_rld_df)
##         WBGeneID embryo_cells_rep1 embryo_GFPplus_rep1 embryo_GFPminus_rep1
## 1 WBGene00021406         6.2555820           6.2404923            6.7264055
## 2 WBGene00021407         1.6270285           3.7627995            3.1018264
## 3 WBGene00021408         4.3899448           3.9906455           -0.1945006
## 4 WBGene00021405         1.4149669           1.1242521           -1.0018001
## 5 WBGene00021409        -1.6479824           0.4976835           -1.6497436
## 6 WBGene00021404         0.7823288           4.8049143           -0.3104858
##   embryo_whole_rep2 embryo_cells_rep2 embryo_GFPplus_rep2 embryo_GFPminus_rep2
## 1         5.6074144         6.0514189           7.2077059           5.63651375
## 2         3.2155408         2.2819475           0.5322060           0.81145980
## 3         0.8772969         1.9119379           4.1867386           0.15942989
## 4        -1.0011478         0.0761009           0.4539224          -0.78371365
## 5        -0.1199332        -1.0495753          -0.6318372          -1.43141907
## 6         2.2426150         1.3076570           3.9963640           0.02150463
##   embryo_whole_rep3 embryo_GFPplus_rep3 embryo_GFPminus_rep3 L1_whole_rep1
## 1         6.3946378           6.1701351            5.3371260     6.2942743
## 2         5.1111637           2.3443024            2.9145740     5.4927342
## 3         1.0016648           3.6808455           -0.1278326     1.7665045
## 4        -0.2411126           0.5100072           -0.9662199     0.0744332
## 5        -1.6300072          -1.5005701           -1.6314758    -0.6986795
## 6         0.8948767           3.9376878           -0.2481658     0.7734699
##   L1_cells_rep1 L1_GFPplus_rep1 L1_GFPminus_rep1 L1_whole_rep2 L1_cells_rep2
## 1      6.215265       7.1523276      5.475690825      6.871012      7.349364
## 2      2.059112       0.6688776      3.345809230      6.090458      4.531783
## 3      2.323949       4.7642525     -0.003068489      3.753990      3.660362
## 4     -0.289981       1.3391497     -0.897876712      1.031559     -1.017490
## 5     -1.641468      -0.4364817     -1.576402672     -1.636860     -1.657334
## 6      2.560664       4.4249224      0.648784335      2.155108      2.481750
##   L1_GFPplus_rep2 L1_GFPminus_rep2 L1_whole_rep3 L1_cells_rep3 L1_GFPplus_rep3
## 1       8.1375056        7.0446586      8.754827     5.2944911      5.88019736
## 2       3.6789190        4.3397281      3.750676     0.9885366      0.96793166
## 3       5.8221251        0.2831530      2.832719     2.7131312      2.66977036
## 4       2.6220324       -0.4780549      1.949497    -0.5798993      1.77456758
## 5       0.4764585       -1.6786929     -1.518256    -0.4614950     -0.03097584
## 6       5.0411874        0.5600313      2.919045     2.3745253      2.05119473
##   L1_GFPminus_rep3 L3_whole_rep1 L3_cells_rep1 L3_GFPplus_rep1 L3_GFPminus_rep1
## 1        5.5231051      5.382862    6.15268861       5.3946695       4.51845807
## 2        2.0066513      4.187100    2.75679809       2.5745843       2.55163090
## 3        1.5334515      0.281495    0.10793873       3.4324436       0.08815672
## 4       -0.4164673      2.040954   -0.83554632       0.1866969      -0.84673921
## 5       -1.1474733     -1.325061   -1.47682440      -1.3938236      -1.49438437
## 6        1.4064931      1.083438   -0.02692795       1.8255081       0.78718356
##   L3_whole_rep2 L3_cells_rep2 L3_GFPminus_rep2 L3_GFPplus_rep2 L3_whole_rep3
## 1  5.2998693418     5.0059836        6.5019362       6.6332685     7.2836887
## 2  3.7948549967     1.6479563        1.7156520       1.9227498     4.1187151
## 3  0.9777458651     0.2512236        0.2966905       1.4461901     0.2238484
## 4 -0.8104324453    -0.6801062       -0.6291386      -0.4768328    -0.7108882
## 5 -1.4520773763    -1.3513123       -1.3119054      -1.1941464    -1.3751122
## 6 -0.0005880635     0.1078969        0.1507012       0.2790291     0.9945243
##   L3_cells_rep3 L3_GFPplus_rep3 L3_GFPminus_rep3
## 1    5.25160442       5.0835999       5.56683622
## 2    2.36806190       2.1076493       1.93232049
## 3    0.09375440       1.6380257       0.15378648
## 4   -0.84357535      -0.3446723      -0.79012502
## 5   -1.48940830      -1.0919631      -1.43637618
## 6   -0.04026466       1.5095165       0.01619519

Average embryo GFP+ sample reads

thresh = 1
sig = 0.01
embryo_rlog_status_df <- all_samples_rld_df %>% 
  select(WBGeneID, embryo_GFPplus_rep1, embryo_GFPplus_rep2, embryo_GFPplus_rep3) %>% 
  pivot_longer(cols = embryo_GFPplus_rep1:embryo_GFPplus_rep3, values_to = "rlog_counts") %>%
  separate(name, sep = "_", into = c("stage", "sample", "rep")) %>%
  group_by(WBGeneID) %>%
  summarise(mean.rlog.counts = mean(rlog_counts), var.rlog.counts = var(rlog_counts)) %>%
  left_join(as.data.frame(res_embryoGFPplus_vs_embryoGFPminus) %>% rownames_to_column(var = "WBGeneID"), by = "WBGeneID") %>%
  mutate(status = case_when(
    log2FoldChange > thresh & padj < sig ~ "enriched",
    log2FoldChange < thresh & padj < sig ~ "depleted",
    TRUE ~ "no_sig_diff",
  )) %>% 
  drop_na(padj)

head(embryo_rlog_status_df)
## # A tibble: 6 × 10
##   WBGeneID  mean.rlog.counts var.rlog.counts baseMean log2FoldChange lfcSE  stat
##   <chr>                <dbl>           <dbl>    <dbl>          <dbl> <dbl> <dbl>
## 1 WBGene00…            10.1          0.00376    1133.         -0.570 0.198 -2.87
## 2 WBGene00…            10.8          0.117       907.          1.88  0.477  3.94
## 3 WBGene00…             8.49         0.186       640.         -1.36  0.541 -2.51
## 4 WBGene00…            11.2          0.103      2633.         -1.39  0.560 -2.48
## 5 WBGene00…             8.77         0.0276      142.          7.60  0.718 10.6 
## 6 WBGene00…             8.14         0.242      2209.         -1.57  0.611 -2.57
## # … with 3 more variables: pvalue <dbl>, padj <dbl>, status <chr>
# write_csv(embryo_rlog_status_df, file = "../03_output/embryo_GFPplus_rlog_counts_status_df.csv", col_names = TRUE)
embryo_rlog_status_df <- read_csv(file = "../03_output/embryo_GFPplus_rlog_counts_status_df.csv", col_names = TRUE)
## Rows: 20856 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): WBGeneID, status
## dbl (8): mean.rlog.counts, var.rlog.counts, baseMean, log2FoldChange, lfcSE,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Correlate embryo expression to ELT-2 ChIP-seq

Import promoter ChIP-seq data

promoters.hilo <- read.table(file = "../../../David/01_promoters/03_output/promoters.hilo.tsv") %>% rownames_to_column(var = "WBGeneID") %>% select(WBGeneID:IDR_nlogq)
elt2chip_embryo_rlog_status_df <- promoters.hilo %>% full_join(embryo_rlog_status_df, by = "WBGeneID") %>% mutate(status = replace_na(status, "no_sig_diff"))
head(elt2chip_embryo_rlog_status_df)
##         WBGeneID seqnames    start      end width strand  len   wbps_gene_id
## 1 WBGene00007725     chrV 19410658 19411857  1200      - 1200 WBGene00007725
## 2 WBGene00044723    chrIV   670356   671555  1200      - 1200 WBGene00044723
## 3 WBGene00008044   chrIII  9318941  9320140  1200      + 1200 WBGene00008044
## 4 WBGene00010745    chrIV 12975303 12976502  1200      + 1200 WBGene00010745
## 5 WBGene00017667     chrX  1336976  1338175  1200      - 1200 WBGene00017667
## 6 WBGene00010128    chrIV 12961808 12963007  1200      + 1200 WBGene00010128
##   gene_name chip_signal_mean chip_signal_max log_chip_signal_mean
## 1   C25F9.5        28.178969        51.42597             7.018431
## 2 K11H12.11         1.787676         9.95639             6.690060
## 3   C40H1.9        10.463166        32.82366             6.806452
## 4    dod-17        -7.739326        19.86670             6.550399
## 5    ttr-37        20.813974        54.60923             6.934053
## 6  F55G11.8        29.777558        70.38104             7.036111
##   log_chip_signal_max IDR_mean  IDR_max IDR_value IDR_nlogq mean.rlog.counts
## 1            7.058353       NA       NA        NA        NA         8.390072
## 2            6.520639       NA       NA        NA        NA         7.516220
## 3            6.841483       NA       NA        NA        NA         6.792083
## 4            6.668518       NA       NA        NA        NA         7.201304
## 5            7.092404       NA       NA        NA        NA        10.813699
## 6            7.250189 57.92531 70.38104  68.89303  2.102826         8.710163
##   var.rlog.counts  baseMean log2FoldChange    lfcSE      stat       pvalue
## 1      0.79856753 359.88608       12.08042 1.327505  9.100099 9.024911e-20
## 2      0.18948951 103.19985       11.25492 1.422045  7.914607 2.480360e-15
## 3      1.16020641  93.78692       10.80597 1.496828  7.219245 5.227685e-13
## 4      0.06246691 529.85461       10.17742 1.658428  6.136787 8.420729e-10
## 5      0.16298455 900.43845       11.03947 0.961437 11.482264 1.619819e-30
## 6      0.29110842 516.20500       11.13789 1.311578  8.491977 2.031510e-17
##           padj   status
## 1 3.979356e-18 enriched
## 2 7.885729e-14 enriched
## 3 1.364563e-11 enriched
## 4 1.645949e-08 enriched
## 5 1.468823e-28 enriched
## 6 7.675576e-16 enriched
elt2chip_embryo_rlog_status_df %>% drop_na(IDR_mean) %>%
  ggplot(aes(x = mean.rlog.counts, y = log10(IDR_mean))) +
  geom_point(shape = 19, size = 0.5, alpha = 0.25, color = "black") +
  # geom_smooth(method = "lm", formula = y~x) +
  # stat_summary(fun.data = "mean_cl_boot") +
  ggpubr::stat_cor(method = "pearson") +
  facet_grid(~status) + 
  theme_bw()
## Warning: Removed 20 rows containing non-finite values (stat_cor).
## Warning: Removed 20 rows containing missing values (geom_point).

elt2chip_embryo_rlog_status_df %>% drop_na(IDR_mean) %>%
  ggplot(aes(x = mean.rlog.counts, y= log10(IDR_mean))) +
  geom_point(shape = 19, size = 0.5, alpha = 0.25, color = "black") +
  geom_smooth(method = "lm", formula = y~x) +
  ggpubr::stat_cor(method = "pearson", label.y = 3.1) +
  ggpubr::stat_regline_equation(label.y = 3) +
  geom_text(x = 4, y = 2.9, aes(label = gene_total), data = elt2chip_embryo_rlog_status_df %>% drop_na(IDR_mean) %>% group_by(status) %>% summarise(gene_total = paste0("# genes = ",n(), sep = ""))) +
  facet_grid(.~status) + 
  theme_classic()
## Warning: Removed 20 rows containing non-finite values (stat_smooth).
## Warning: Removed 20 rows containing non-finite values (stat_cor).
## Warning: Removed 20 rows containing non-finite values (stat_regline_equation).
## Warning: Removed 20 rows containing missing values (geom_point).

# ggsave(filename = "../03_output/PearsonCorrelation_embryoGFPplus_vs_ELT2_ChIP_Seq_Bound.pdf", width = 6, height = 3, dpi = 300)
elt2chip_embryo_rlog_status_df %>% 
  ggplot(aes(x = mean.rlog.counts, y= log_chip_signal_mean)) +
  geom_point(shape = 19, size = 0.25, alpha = 0.25, color = "black") +
  geom_smooth(method = "lm", formula = y~x) +
  ggpubr::stat_cor(method = "pearson", label.y = 11) +
  ggpubr::stat_regline_equation(label.y = 10) +
  geom_text(x = 5, y = 9, aes(label = gene_total), data = elt2chip_embryo_rlog_status_df %>% group_by(status) %>% summarise(gene_total = paste0("# genes = ",n(), sep = ""))) +
  facet_grid(.~status) + 
  theme_classic()
## Warning: Removed 10308 rows containing non-finite values (stat_smooth).
## Warning: Removed 10308 rows containing non-finite values (stat_cor).
## Warning: Removed 10308 rows containing non-finite values
## (stat_regline_equation).
## Warning: Removed 10308 rows containing missing values (geom_point).

ggsave(filename = "../03_output/PearsonCorrelation_embryoGFPplus_vs_ELT2_ChIP_Seq_All_Promoters.pdf", width = 6, height = 3, dpi = 300)
## Warning: Removed 10308 rows containing non-finite values (stat_smooth).
## Warning: Removed 10308 rows containing non-finite values (stat_cor).
## Warning: Removed 10308 rows containing non-finite values
## (stat_regline_equation).
## Warning: Removed 10308 rows containing missing values (geom_point).
elt2chip_embryo_rlog_status_df %>% 
  ggplot(aes(x = mean.rlog.counts)) +
  geom_histogram() +
  # geom_point(shape = 19, size = 0.5, alpha = 0.1) +
  # geom_smooth(method = "lm", formula = y~x) +
  # stat_summary(fun.data = "mean_cl_boot") +
  # ggpubr::stat_cor(method = "pearson") +
  facet_grid(~status)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1349 rows containing non-finite values (stat_bin).

statuses <- c("depleted", "enriched", "no_sig_diff")
for(i in 1:length(statuses)){
  print(statuses[i])
cor.test_result<- cor.test(x = elt2chip_embryo_rlog_status_df %>% filter(status == statuses[i]) %>% pull(mean.rlog.counts), 
         y = elt2chip_embryo_rlog_status_df %>% filter(status == statuses[i]) %>% pull(log_chip_signal_mean),
         method = "pearson")
cor_result<- cor(x = elt2chip_embryo_rlog_status_df %>% filter(status == statuses[i]) %>% pull(mean.rlog.counts), 
         y = elt2chip_embryo_rlog_status_df %>% filter(status == statuses[i]) %>% pull(log_chip_signal_mean),
         method = "pearson",
         use = "complete.obs")
print(cor.test_result)  
print(cor_result)  
}
## [1] "depleted"
## 
##  Pearson's product-moment correlation
## 
## data:  elt2chip_embryo_rlog_status_df %>% filter(status == statuses[i]) %>% pull(mean.rlog.counts) and elt2chip_embryo_rlog_status_df %>% filter(status == statuses[i]) %>% pull(log_chip_signal_mean)
## t = 14.236, df = 1157, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3359550 0.4340028
## sample estimates:
##       cor 
## 0.3860687 
## 
## [1] 0.3860687
## [1] "enriched"
## 
##  Pearson's product-moment correlation
## 
## data:  elt2chip_embryo_rlog_status_df %>% filter(status == statuses[i]) %>% pull(mean.rlog.counts) and elt2chip_embryo_rlog_status_df %>% filter(status == statuses[i]) %>% pull(log_chip_signal_mean)
## t = 17.265, df = 1699, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3451482 0.4260441
## sample estimates:
##       cor 
## 0.3863389 
## 
## [1] 0.3863389
## [1] "no_sig_diff"
## 
##  Pearson's product-moment correlation
## 
## data:  elt2chip_embryo_rlog_status_df %>% filter(status == statuses[i]) %>% pull(mean.rlog.counts) and elt2chip_embryo_rlog_status_df %>% filter(status == statuses[i]) %>% pull(log_chip_signal_mean)
## t = 51.863, df = 9035, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4629191 0.4946983
## sample estimates:
##       cor 
## 0.4789656 
## 
## [1] 0.4789656
yomamam<-lm(log_chip_signal_mean~mean.rlog.counts*status,data = elt2chip_embryo_rlog_status_df)
summary(yomamam)
## 
## Call:
## lm(formula = log_chip_signal_mean ~ mean.rlog.counts * status, 
##     data = elt2chip_embryo_rlog_status_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5529 -0.1828 -0.0243  0.1458  4.3056 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         6.4527549  0.0229022 281.753  < 2e-16 ***
## mean.rlog.counts                    0.0406101  0.0028087  14.459  < 2e-16 ***
## statusenriched                     -0.0239733  0.0347545  -0.690    0.490    
## statusno_sig_diff                  -0.0017537  0.0236389  -0.074    0.941    
## mean.rlog.counts:statusenriched     0.0189802  0.0039290   4.831 1.38e-06 ***
## mean.rlog.counts:statusno_sig_diff  0.0004909  0.0029320   0.167    0.867    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3154 on 11891 degrees of freedom
##   (10308 observations deleted due to missingness)
## Multiple R-squared:  0.2626, Adjusted R-squared:  0.2623 
## F-statistic: 846.9 on 5 and 11891 DF,  p-value: < 2.2e-16